{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:21:34.460000Z",
     "start_time": "2019-05-11T08:21:32.766741Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n",
      "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
      "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
      "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
      "  \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
     ]
    }
   ],
   "source": [
    "import ast\n",
    "from itertools import product\n",
    "\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.metrics import accuracy_score, recall_score, precision_score\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "import warnings\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import lightgbm as lgb\n",
    "import os\n",
    "\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "# project_path='./Competitions/KDD-Cup-2019-CAMMTR/'\n",
    "project_path = '../../'\n",
    "mainpath = project_path + r'Data/data_set_phase1/'\n",
    "\n",
    "\n",
    "# Read the holiday list inside a context manager so the file handle is\n",
    "# closed as soon as the JSON has been parsed.\n",
    "with open(project_path + r'Config/Holidays.json') as holidays_file:\n",
    "    holidays = pd.Series(json.load(holidays_file)['holidays'])\n",
    "# Reference tables read from the Data directory.\n",
    "subwayinfo = pd.read_csv(project_path + r'Data/Beijing.csv')\n",
    "weatherinfo = pd.read_csv(project_path + r'Data/Weather.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:29:02.598615Z",
     "start_time": "2019-05-11T08:28:57.935295Z"
    }
   },
   "outputs": [],
   "source": [
    "def _read_table(filename, **read_kwargs):\n",
    "    \"\"\"Read one CSV from the phase-1 data directory (``mainpath``).\"\"\"\n",
    "    return pd.read_csv(mainpath + filename, **read_kwargs)\n",
    "\n",
    "\n",
    "# Query tables carry a req_time column, plan tables a plan_time column;\n",
    "# both are parsed to datetimes on load.\n",
    "train_queries = _read_table('train_queries.csv', parse_dates=['req_time'])\n",
    "test_queries = _read_table('test_queries.csv', parse_dates=['req_time'])\n",
    "train_plans = _read_table('train_plans.csv', parse_dates=['plan_time'])\n",
    "test_plans = _read_table('test_plans.csv', parse_dates=['plan_time'])\n",
    "train_clicks = _read_table('train_clicks.csv')\n",
    "profiles = _read_table('profiles.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:29:16.850580Z",
     "start_time": "2019-05-11T08:29:15.999403Z"
    }
   },
   "outputs": [],
   "source": [
    "# Left joins keep every query row, even when it has no plan or click record.\n",
    "train = (train_queries\n",
    "         .merge(train_plans, how='left', on=['sid'])\n",
    "         .merge(train_clicks, how='left', on=['sid']))\n",
    "test = test_queries.merge(test_plans, how='left', on=['sid'])\n",
    "# Queries without a click record are labelled 0.\n",
    "train['click_mode'] = train['click_mode'].fillna(0).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:29:19.337191Z",
     "start_time": "2019-05-11T08:29:18.320836Z"
    }
   },
   "outputs": [],
   "source": [
    "# Stack train and test rows into one frame, then attach the profile\n",
    "# columns through a left join on pid.\n",
    "data = (pd.concat([train, test], ignore_index=True)\n",
    "          .merge(profiles, how='left', on=['pid']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:29:22.347798Z",
     "start_time": "2019-05-11T08:29:20.789121Z"
    }
   },
   "outputs": [],
   "source": [
    "# Split the comma-separated origin (o) and destination (d) strings into\n",
    "# float columns: first field -> *_lng, second field -> *_lat.\n",
    "for point in ('o', 'd'):\n",
    "    for pos, part in enumerate(('lng', 'lat')):\n",
    "        data[point + '_' + part] = data[point].apply(\n",
    "            lambda text, pos=pos: float(text.split(',')[pos]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:29:34.353092Z",
     "start_time": "2019-05-11T08:29:34.284019Z"
    }
   },
   "outputs": [],
   "source": [
    "# Hour-of-day and weekday of each datetime column listed below; the\n",
    "# generated column names are collected in time_feature.\n",
    "time_feature = []\n",
    "for col in ['req_time']:\n",
    "    for part in ('hour', 'weekday'):\n",
    "        derived = '{}_{}'.format(col, part)\n",
    "        data[derived] = getattr(data[col].dt, part)\n",
    "        time_feature.append(derived)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:30:09.535274Z",
     "start_time": "2019-05-11T08:30:05.715485Z"
    }
   },
   "outputs": [],
   "source": [
    "# Gap between plan_time and req_time, in seconds. Subtracting the datetime\n",
    "# columns directly keeps rows with a missing plan_time (queries without\n",
    "# plans) as NaN instead of pushing the NaT sentinel through an integer cast.\n",
    "data['time_diff'] = (data['plan_time'] - data['req_time']).dt.total_seconds()\n",
    "# Guard the append so that re-running this cell does not list the feature twice.\n",
    "if 'time_diff' not in time_feature:\n",
    "    time_feature.append('time_diff')\n",
    "\n",
    "# Flag requests whose month-day string appears in the holiday list.\n",
    "data['req_date'] = data['req_time'].dt.strftime('%m-%d')\n",
    "data['if_holiday'] = data['req_date'].isin(holidays).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:30:20.565290Z",
     "start_time": "2019-05-11T08:30:20.540861Z"
    }
   },
   "outputs": [],
   "source": [
    "def gen_plan_feas(data):\n",
    "    \"\"\"Derive per-row features from the parsed plan lists in data['plans_json'].\n",
    "\n",
    "    For every row this produces:\n",
    "      * mode_feas_0..11 - indicator of the transport modes offered\n",
    "        (slot 0 is set when the row has no plans),\n",
    "      * max/min/mean/std of plan distance, price and eta (an empty price\n",
    "        counts as 0),\n",
    "      * the transport mode of the plan attaining each max/min, and the\n",
    "        mode of the first plan,\n",
    "      * svd_mode_0..9 - TruncatedSVD of a TF-IDF (1- and 2-gram) encoding\n",
    "        of the mode sequence.\n",
    "    Rows without plans get first_mode 0 and -1 for every other statistic.\n",
    "\n",
    "    Returns a DataFrame with those columns plus 'sid' copied from data.\n",
    "    \"\"\"\n",
    "    n_rows = data.shape[0]\n",
    "    metrics = ('dist', 'price', 'eta')\n",
    "    # Statistic columns, in the order they appear in the returned frame.\n",
    "    stat_cols = (\n",
    "        ['{}_{}'.format(agg, metric)\n",
    "         for metric in metrics for agg in ('max', 'min', 'mean', 'std')]\n",
    "        + ['{}_{}_mode'.format(agg, metric)\n",
    "           for metric in metrics for agg in ('max', 'min')]\n",
    "        + ['first_mode'])\n",
    "    stats = dict((col, np.zeros((n_rows,))) for col in stat_cols)\n",
    "    mode_onehot = np.zeros((n_rows, 12))\n",
    "\n",
    "    mode_texts = []\n",
    "    for row, plans in tqdm(enumerate(data['plans_json'].values)):\n",
    "        if len(plans) == 0:\n",
    "            # No plans offered: sentinel values everywhere.\n",
    "            mode_onehot[row, 0] = 1\n",
    "            for col in stat_cols:\n",
    "                stats[col][row] = -1\n",
    "            stats['first_mode'][row] = 0\n",
    "            mode_texts.append('word_null')\n",
    "            continue\n",
    "\n",
    "        raw = {'dist': [], 'price': [], 'eta': []}\n",
    "        modes = []\n",
    "        for plan in plans:\n",
    "            raw['dist'].append(int(plan['distance']))\n",
    "            raw['price'].append(\n",
    "                0 if plan['price'] == '' else int(plan['price']))\n",
    "            raw['eta'].append(int(plan['eta']))\n",
    "            modes.append(int(plan['transport_mode']))\n",
    "        mode_texts.append(' '.join('word_{}'.format(m) for m in modes))\n",
    "\n",
    "        modes = np.array(modes, dtype='int')\n",
    "        mode_onehot[row, modes] = 1\n",
    "        stats['first_mode'][row] = modes[0]\n",
    "        for metric in metrics:\n",
    "            values = np.array(raw[metric])\n",
    "            order = np.argsort(values)\n",
    "            lo, hi = order[0], order[-1]\n",
    "            stats['max_' + metric][row] = values[hi]\n",
    "            stats['min_' + metric][row] = values[lo]\n",
    "            stats['mean_' + metric][row] = np.mean(values)\n",
    "            stats['std_' + metric][row] = np.std(values)\n",
    "            stats['max_' + metric + '_mode'][row] = modes[hi]\n",
    "            stats['min_' + metric + '_mode'][row] = modes[lo]\n",
    "\n",
    "    feature_data = pd.DataFrame(\n",
    "        mode_onehot, columns=['mode_feas_{}'.format(k) for k in range(12)])\n",
    "    for col in stat_cols:\n",
    "        feature_data[col] = stats[col]\n",
    "\n",
    "    print('mode tfidf...')\n",
    "    tfidf_vec = TfidfVectorizer(ngram_range=(1, 2)).fit_transform(mode_texts)\n",
    "    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)\n",
    "    mode_svd = pd.DataFrame(\n",
    "        svd_enc.fit_transform(tfidf_vec),\n",
    "        columns=['svd_mode_{}'.format(k) for k in range(10)])\n",
    "\n",
    "    plan_fea = pd.concat([feature_data, mode_svd], axis=1)\n",
    "    plan_fea['sid'] = data['sid'].values\n",
    "    return plan_fea\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:30:24.612800Z",
     "start_time": "2019-05-11T08:30:24.606921Z"
    }
   },
   "outputs": [],
   "source": [
    "def trans_features(data):\n",
    "    \"\"\"Add the distance from each origin/destination to its nearest subway station.\n",
    "\n",
    "    The distance is |lng difference| + |lat difference| in the raw coordinate\n",
    "    units, minimised over the rows of the module-level ``subwayinfo`` table.\n",
    "\n",
    "    ``data`` is modified in place (columns 'o_nearest_dis' and\n",
    "    'd_nearest_dis' are written) and is also returned.\n",
    "    \"\"\"\n",
    "    # Evaluate every distinct coordinate string once and look the result up\n",
    "    # afterwards, instead of rescanning the whole frame per coordinate.\n",
    "    all_co = pd.concat([data['d'], data['o']]).unique()\n",
    "\n",
    "    nearest_dis = {}\n",
    "    for co in tqdm(all_co):\n",
    "        lg, la = co.split(',')\n",
    "        nearest_dis[co] = (abs(subwayinfo['station_longitude'] - float(lg)) +\n",
    "                           abs(subwayinfo['station_latitude'] - float(la))).min()\n",
    "\n",
    "    data['o_nearest_dis'] = data['o'].map(nearest_dis)\n",
    "    data['d_nearest_dis'] = data['d'].map(nearest_dis)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:32:28.415162Z",
     "start_time": "2019-05-11T08:30:27.379467Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "594358it [01:35, 6192.52it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mode tfidf...\n"
     ]
    }
   ],
   "source": [
    "# Decode the raw plans string of every row; rows without plans decode to [].\n",
    "data['plans_json'] = data['plans'].fillna('[]').apply(json.loads)\n",
    "\n",
    "data_plans = gen_plan_feas(data)\n",
    "# Every generated column except the join key is a model feature.\n",
    "plan_features = [name for name in data_plans.columns if name != 'sid']\n",
    "data = pd.merge(data, data_plans, how='left', on='sid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:40:11.479645Z",
     "start_time": "2019-05-11T08:33:13.110366Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6525/6525 [06:56<00:00, 17.06it/s]\n"
     ]
    }
   ],
   "source": [
    "# Adds the o_nearest_dis / d_nearest_dis columns (distance from the origin\n",
    "# and destination to the closest station in subwayinfo).\n",
    "data = trans_features(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T08:41:46.616542Z",
     "start_time": "2019-05-11T08:41:46.611793Z"
    }
   },
   "outputs": [],
   "source": [
    "def f1_weighted(labels, preds):\n",
    "    \"\"\"Custom LightGBM eval metric: weighted F1 over the 12 click-mode classes.\n",
    "\n",
    "    ``preds`` may be either\n",
    "      * a 2-D array with one row per sample and one column per class, or\n",
    "      * a flat vector holding all scores of class 0, then class 1, ...\n",
    "        (the layout the reshape(12, -1) below relies on).\n",
    "\n",
    "    The signature must keep exactly two parameters: LightGBM decides what to\n",
    "    pass to a custom metric from its argument count.\n",
    "\n",
    "    Returns the tuple (metric name, score, is_higher_better).\n",
    "    \"\"\"\n",
    "    preds = np.asarray(preds)\n",
    "    if preds.ndim == 2 and preds.shape[0] == len(labels):\n",
    "        # Sample-major scores: pick the best class per row.\n",
    "        pred_labels = np.argmax(preds, axis=1)\n",
    "    else:\n",
    "        # Class-major flat scores: one row per class after the reshape.\n",
    "        pred_labels = np.argmax(preds.reshape(12, -1), axis=0)\n",
    "    score = f1_score(y_true=labels, y_pred=pred_labels, average='weighted')\n",
    "    return 'f1_weighted', score, True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T09:02:45.061050Z",
     "start_time": "2019-05-11T09:02:45.053128Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "119 ['o_lng', 'o_lat', 'd_lng', 'd_lat', 'if_holiday', 'o_nearest_dis', 'o_nearest_dis', 'p0', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10', 'p11', 'p12', 'p13', 'p14', 'p15', 'p16', 'p17', 'p18', 'p19', 'p20', 'p21', 'p22', 'p23', 'p24', 'p25', 'p26', 'p27', 'p28', 'p29', 'p30', 'p31', 'p32', 'p33', 'p34', 'p35', 'p36', 'p37', 'p38', 'p39', 'p40', 'p41', 'p42', 'p43', 'p44', 'p45', 'p46', 'p47', 'p48', 'p49', 'p50', 'p51', 'p52', 'p53', 'p54', 'p55', 'p56', 'p57', 'p58', 'p59', 'p60', 'p61', 'p62', 'p63', 'p64', 'p65', 'pid', 'mode_feas_0', 'mode_feas_1', 'mode_feas_2', 'mode_feas_3', 'mode_feas_4', 'mode_feas_5', 'mode_feas_6', 'mode_feas_7', 'mode_feas_8', 'mode_feas_9', 'mode_feas_10', 'mode_feas_11', 'max_dist', 'min_dist', 'mean_dist', 'std_dist', 'max_price', 'min_price', 'mean_price', 'std_price', 'max_eta', 'min_eta', 'mean_eta', 'std_eta', 'max_dist_mode', 'min_dist_mode', 'max_price_mode', 'min_price_mode', 'max_eta_mode', 'min_eta_mode', 'first_mode', 'svd_mode_0', 'svd_mode_1', 'svd_mode_2', 'svd_mode_3', 'svd_mode_4', 'svd_mode_5', 'svd_mode_6', 'svd_mode_7', 'svd_mode_8', 'svd_mode_9', 'req_time_hour', 'req_time_weekday', 'time_diff', 'time_diff']\n"
     ]
    }
   ],
   "source": [
    "profile_feature = ['p' + str(i) for i in range(66)]\n",
    "# Both nearest-station distances are used: origin and destination.\n",
    "origin_num_feature = ['o_lng', 'o_lat', 'd_lng', 'd_lat',\n",
    "                      'if_holiday', 'o_nearest_dis', 'd_nearest_dis'] + profile_feature\n",
    "cate_feature = ['pid']\n",
    "\n",
    "# Keep the first occurrence of every name so the model never receives the\n",
    "# same column twice (e.g. when a feature-building cell was re-run).\n",
    "feature = []\n",
    "for name in origin_num_feature + cate_feature + plan_features + time_feature:\n",
    "    if name not in feature:\n",
    "        feature.append(name)\n",
    "print(len(feature), feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T09:02:54.788841Z",
     "start_time": "2019-05-11T09:02:50.738680Z"
    }
   },
   "outputs": [],
   "source": [
    "# Time-based split. The lower bounds are inclusive (>=) so that a request\n",
    "# stamped exactly at midnight on a boundary date belongs to exactly one\n",
    "# split instead of being dropped from all of them.\n",
    "train_index = (data.req_time < '2018-11-23')\n",
    "train_x = data.loc[train_index, feature].reset_index(drop=True)\n",
    "train_y = data.loc[train_index, 'click_mode'].reset_index(drop=True)\n",
    "\n",
    "valid_index = (data.req_time >= '2018-11-23') & (data.req_time < '2018-12-01')\n",
    "valid_x = data.loc[valid_index, feature].reset_index(drop=True)\n",
    "valid_y = data.loc[valid_index, 'click_mode'].reset_index(drop=True)\n",
    "\n",
    "test_index = (data.req_time >= '2018-12-01')\n",
    "test_x = data.loc[test_index, feature].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-11T09:06:24.325796Z",
     "start_time": "2019-05-11T09:02:56.578577Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 100 rounds.\n",
      "[10]\tvalid_0's f1_weighted: 0.628442\n",
      "[20]\tvalid_0's f1_weighted: 0.676041\n",
      "[30]\tvalid_0's f1_weighted: 0.682397\n",
      "[40]\tvalid_0's f1_weighted: 0.683972\n",
      "[50]\tvalid_0's f1_weighted: 0.685252\n",
      "[60]\tvalid_0's f1_weighted: 0.685782\n",
      "[70]\tvalid_0's f1_weighted: 0.686445\n",
      "[80]\tvalid_0's f1_weighted: 0.686626\n",
      "[90]\tvalid_0's f1_weighted: 0.686873\n",
      "[100]\tvalid_0's f1_weighted: 0.687327\n",
      "[110]\tvalid_0's f1_weighted: 0.687457\n",
      "[120]\tvalid_0's f1_weighted: 0.687518\n",
      "[130]\tvalid_0's f1_weighted: 0.687687\n",
      "[140]\tvalid_0's f1_weighted: 0.687587\n",
      "[150]\tvalid_0's f1_weighted: 0.687683\n",
      "[160]\tvalid_0's f1_weighted: 0.687362\n",
      "[170]\tvalid_0's f1_weighted: 0.687197\n",
      "[180]\tvalid_0's f1_weighted: 0.687468\n",
      "[190]\tvalid_0's f1_weighted: 0.687257\n",
      "[200]\tvalid_0's f1_weighted: 0.687018\n",
      "[210]\tvalid_0's f1_weighted: 0.686813\n",
      "[220]\tvalid_0's f1_weighted: 0.686691\n",
      "[230]\tvalid_0's f1_weighted: 0.686507\n",
      "[240]\tvalid_0's f1_weighted: 0.686479\n",
      "Early stopping, best iteration is:\n",
      "[149]\tvalid_0's f1_weighted: 0.687848\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,\n",
       "        importance_type='split', learning_rate=0.05, max_depth=-1,\n",
       "        metric='None', min_child_samples=50, min_child_weight=0.001,\n",
       "        min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=61,\n",
       "        objective='multiclass', random_state=2019, reg_alpha=0,\n",
       "        reg_lambda=0.01, silent=True, subsample=0.8,\n",
       "        subsample_for_bin=200000, subsample_freq=1)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Multiclass gradient-boosted trees; metric=\"None\" is passed together with\n",
    "# the custom f1_weighted eval metric supplied to fit() below.\n",
    "lgb_model = lgb.LGBMClassifier(\n",
    "    objective='multiclass',\n",
    "    boosting_type=\"gbdt\",\n",
    "    metric=\"None\",\n",
    "    n_estimators=2000,\n",
    "    learning_rate=0.05,\n",
    "    num_leaves=61,\n",
    "    max_depth=-1,\n",
    "    min_child_samples=50,\n",
    "    subsample=0.8,\n",
    "    subsample_freq=1,\n",
    "    colsample_bytree=0.8,\n",
    "    reg_alpha=0,\n",
    "    reg_lambda=0.01,\n",
    "    random_state=2019,\n",
    "    n_jobs=-1,\n",
    ")\n",
    "eval_set = [(valid_x, valid_y)]\n",
    "# Stop once the validation score has not improved for 100 rounds; log every 10.\n",
    "lgb_model.fit(\n",
    "    train_x,\n",
    "    train_y,\n",
    "    eval_set=eval_set,\n",
    "    eval_metric=f1_weighted,\n",
    "    categorical_feature=cate_feature,\n",
    "    early_stopping_rounds=100,\n",
    "    verbose=10,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred = lgb_model.predict(valid_x)\n",
    "df_analysis = pd.DataFrame()\n",
    "df_analysis['sid'] = data[valid_index]['sid']\n",
    "df_analysis['label'] = valid_y.values\n",
    "df_analysis['pred'] = pred\n",
    "df_analysis['label'] = df_analysis['label'].astype(int)\n",
    "# Share of each class among the validation labels, used as its F1 weight.\n",
    "dic_ = df_analysis['label'].value_counts(normalize=True)\n",
    "\n",
    "\n",
    "def get_weighted_fscore(y_pred, y_true):\n",
    "    \"\"\"Print per-class metrics and the label-frequency-weighted F1.\n",
    "\n",
    "    For each class 0..11 a one-vs-rest F1, precision and recall are computed\n",
    "    and printed along with the class weight taken from the module-level\n",
    "    ``dic_``. A class that does not occur in ``dic_`` gets weight 0 instead\n",
    "    of raising KeyError.\n",
    "\n",
    "    Returns the weighted F1 sum (also printed).\n",
    "    \"\"\"\n",
    "    f_score = 0\n",
    "    for i in range(12):\n",
    "        yt = y_true == i\n",
    "        yp = y_pred == i\n",
    "        weight = dic_.get(i, 0.0)\n",
    "        class_f1 = f1_score(y_true=yt, y_pred=yp)\n",
    "        f_score += weight * class_f1\n",
    "        print(i, weight, class_f1,\n",
    "              precision_score(y_true=yt, y_pred=yp),\n",
    "              recall_score(y_true=yt, y_pred=yp))\n",
    "    print(f_score)\n",
    "    return f_score\n",
    "\n",
    "\n",
    "valid_weighted_f1 = get_weighted_fscore(\n",
    "    y_true=df_analysis['label'], y_pred=df_analysis['pred'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Refit on every labelled row (everything before the test period).\n",
    "labelled_index = (data.req_time < '2018-12-01')\n",
    "all_train_x = data[labelled_index][feature].reset_index(drop=True)\n",
    "all_train_y = data[labelled_index].click_mode.reset_index(drop=True)\n",
    "\n",
    "best_iter = lgb_model.best_iteration_\n",
    "print(best_iter)\n",
    "# Only overwrite n_estimators when a best iteration is actually available, so\n",
    "# a missing value never ends up in the model parameters.\n",
    "if best_iter:\n",
    "    lgb_model.n_estimators = best_iter\n",
    "lgb_model.fit(all_train_x, all_train_y, categorical_feature=cate_feature)\n",
    "print('fit over')\n",
    "\n",
    "result = pd.DataFrame()\n",
    "result['sid'] = data[test_index]['sid']\n",
    "result['recommend_mode'] = lgb_model.predict(test_x)\n",
    "result['recommend_mode'] = result['recommend_mode'].astype(int)\n",
    "print(len(result))\n",
    "print(result['recommend_mode'].value_counts())\n",
    "\n",
    "# The original wrote to an undefined name `path`; the submission is placed\n",
    "# under project_path instead.\n",
    "# NOTE(review): confirm that <project_path>/sub is the intended output directory.\n",
    "sub_dir = os.path.join(project_path, 'sub')\n",
    "os.makedirs(sub_dir, exist_ok=True)\n",
    "result[['sid', 'recommend_mode']].to_csv(\n",
    "    os.path.join(sub_dir, 'baseline.csv'), index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
